Purtell_ProblemSet05

Author

Colin Purtell

STATS 506 Problem Set 5:

Link to repository: https://github.com/cspurtell/stat506

Problem 1

library(tidyverse)
Warning: package 'ggplot2' was built under R version 4.4.3
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   4.0.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)
Warning: package 'plotly' was built under R version 4.4.3

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout

a.

Class definition:

setClass(
  Class = "waldCI",
  slots = list(
    lb = "numeric",
    ub = "numeric",
    level = "numeric"
  )
)

Validity function:

setValidity("waldCI", function(object) {
  msgs <- character()
  if (length(object@lb) != 1) {
    msgs <- c(msgs, "lb must be a numeric of length 1.")
    }
  if (length(object@ub) != 1) {
    msgs <- c(msgs, "ub must be a numeric of length 1.")
    }
  if (length(object@level) != 1) {
    msgs <- c(msgs, "level must be a numeric of length 1.")
    }
  if (is.na(object@lb) || is.na(object@ub) || is.na(object@level)) {
    msgs <- c(msgs, "lb, ub, and level must not be NA.")
    }
  if (!is.finite(object@lb) || !is.finite(object@ub)) {
    msgs <- c(msgs, "lb and ub must be finite.")
    }
  if (object@level <= 0 || object@level >= 1) {
    msgs <- c(msgs, "level must be strictly between 0 and 1.")
    }
  if (object@lb >= object@ub) {
    msgs <- c(msgs, "lb must be strictly less than ub.")
    }
  if (length(msgs) == 0) {
    TRUE 
    } else msgs
})
Class "waldCI" [in ".GlobalEnv"]

Slots:
                              
Name:       lb      ub   level
Class: numeric numeric numeric

Constructor:

makeWaldCI <- function(level, mean = NULL, sterr = NULL, lb = NULL, ub = NULL) {
  if (!is.null(mean) || !is.null(sterr)) {
    if (!is.numeric(sterr) || length(sterr) != 1 || is.na(sterr) || sterr <= 0) {
      stop("sterr must be a single positive number.")
    }
    
    if (!is.numeric(mean) || length(mean) != 1 || is.na(mean)) {
      stop("mean must be a single numeric value.")
    }
    
    z <- qnorm((1 + level) / 2)
    half_width <- z * sterr
    lb <- mean - half_width
    ub <- mean + half_width
    
  } else {
    if (is.null(lb) || is.null(ub)) {
      stop("You must supply either (mean, sterr) or (lb, ub).")
    }
  }
  
  obj <- new("waldCI", lb = lb, ub = ub, level = level)
  validObject(obj)
  obj
}

Show method:

setMethod("show", "waldCI", function(object) {
  cat(sprintf(
    "Wald CI (%.1f%%): [%.4f, %.4f]\n",
    100 * object@level,
    object@lb,
    object@ub
  ))
})

Accessors:

setGeneric("lb", function(x) standardGeneric("lb"))
[1] "lb"
setMethod("lb", "waldCI", function(x) x@lb)

setGeneric("ub", function(x) standardGeneric("ub"))
[1] "ub"
setMethod("ub", "waldCI", function(x) x@ub)

setGeneric("level", function(x) standardGeneric("level"))
[1] "level"
setMethod("level", "waldCI", function(x) x@level)

setMethod("mean", "waldCI", function(x, ...) {
  (x@lb + x@ub) / 2
})

setGeneric("sterr", function(x) standardGeneric("sterr"))
[1] "sterr"
setMethod("sterr", "waldCI", function(x) {
  z <- qnorm((1 + x@level) / 2)
  (x@ub - x@lb) / (2 * z)
})

Setters:

setGeneric("lb<-", function(x, value) standardGeneric("lb<-"))
[1] "lb<-"
setReplaceMethod("lb", "waldCI", function(x, value) {
  x@lb <- value
  validObject(x)
  x
})

setGeneric("ub<-", function(x, value) standardGeneric("ub<-"))
[1] "ub<-"
setReplaceMethod("ub", "waldCI", function(x, value) {
  x@ub <- value
  validObject(x)
  x
})

setGeneric("level<-", function(x, value) standardGeneric("level<-"))
[1] "level<-"
setReplaceMethod("level", "waldCI", function(x, value) {
  x@level <- value
  validObject(x)
  x
})

setGeneric("mean<-", function(x, value) standardGeneric("mean<-"))
[1] "mean<-"
setReplaceMethod("mean", "waldCI", function(x, value) {
  se <- sterr(x)
  z  <- qnorm((1 + x@level) / 2)
  half_width <- z * se
  x@lb <- value - half_width
  x@ub <- value + half_width
  validObject(x)
  x
})

setGeneric("sterr<-", function(x, value) standardGeneric("sterr<-"))
[1] "sterr<-"
setReplaceMethod("sterr", "waldCI", function(x, value) {
  if (value <= 0) stop("sterr must be positive.")
  m  <- mean(x)
  z  <- qnorm((1 + x@level) / 2)
  half_width <- z * value
  x@lb <- m - half_width
  x@ub <- m + half_width
  validObject(x)
  x
})

Contains method:

setGeneric("contains", function(object, value) standardGeneric("contains"))
Creating a new generic function for 'contains' in the global environment
[1] "contains"
setMethod("contains", signature(object = "waldCI", value = "numeric"),
          function(object, value) {
            (value >= object@lb) & (value <= object@ub)
          })

Overlap method:

setGeneric("overlap", function(x, y) standardGeneric("overlap"))
[1] "overlap"
setMethod("overlap", signature(x = "waldCI", y = "waldCI"),
          function(x, y) {
            max(lb(x), lb(y)) <= min(ub(x), ub(y))
          })

as.numeric method:

setMethod("as.numeric", "waldCI", function(x, ...) {
  c(lb(x), ub(x))
})

Monotonic transformation method:

setGeneric("transformCI", function(ci, f) standardGeneric("transformCI"))
[1] "transformCI"
setMethod("transformCI", signature(ci = "waldCI", f = "function"),
          function(ci, f) {
            # crude monotonicity check on [lb, mid, ub]
            xs <- c(lb(ci), mean(ci), ub(ci))
            ys <- sapply(xs, f)
            
            inc <- all(diff(ys) >= 0)
            dec <- all(diff(ys) <= 0)
            
            if (!(inc || dec)) {
              warning("transformCI: function does not appear monotonic on this interval; results may be meaningless.")
            }
            
            # transform endpoints and reorder if needed
            a <- f(lb(ci))
            b <- f(ub(ci))
            new_bounds <- sort(c(a, b))
            
            makeWaldCI(
              level = level(ci),
              lb    = new_bounds[1],
              ub    = new_bounds[2]
            )
          })

b.

Testing the constructor and parameters:

ci1 <- makeWaldCI(level = 0.95, lb = 17.2, ub = 24.7)
ci2 <- makeWaldCI(level = 0.99, mean = 13, sterr = 2.5)
ci3 <- makeWaldCI(level = 0.75, lb = 27.43, ub = 39.22)

ci1
Wald CI (95.0%): [17.2000, 24.7000]
ci2
Wald CI (99.0%): [6.5604, 19.4396]
ci3
Wald CI (75.0%): [27.4300, 39.2200]

Testing as.numeric:

as.numeric(ci1)
[1] 17.2 24.7
as.numeric(ci2)
[1]  6.560427 19.439573
as.numeric(ci3)
[1] 27.43 39.22

Testing accessors:

lb(ci2)
[1] 6.560427
ub(ci2)
[1] 19.43957
mean(ci1)
[1] 20.95
sterr(ci3)
[1] 5.12453
level(ci2)
[1] 0.99

Testing setters:

lb(ci2) <- 10.5
mean(ci3) <- 34
level(ci3) <- .8

Testing contains and overlap method:

contains(ci1, 17)
[1] FALSE
contains(ci3, 44)
[1] FALSE
overlap(ci1, ci2)
[1] TRUE

Testing a monotonic transformation:

eci1 <- transformCI(ci1, sqrt)
eci1
Wald CI (95.0%): [4.1473, 4.9699]
mean(transformCI(ci2, log))
[1] 2.659343

c.

Testing negative standard error:

try(makeWaldCI(level = 0.95, mean = 10, sterr = -1))
Error in makeWaldCI(level = 0.95, mean = 10, sterr = -1) : 
  sterr must be a single positive number.

Testing lb > ub:

try(makeWaldCI(level = 0.95, lb = 5, ub = 2))
Error in validObject(.Object) : 
  invalid class "waldCI" object: lb must be strictly less than ub.

Testing infinite bounds:

try(makeWaldCI(level = 0.95, lb = -Inf, ub = 5))
Error in validObject(.Object) : 
  invalid class "waldCI" object: lb and ub must be finite.
try(makeWaldCI(level = 0.95, lb = 0,   ub = Inf))
Error in validObject(.Object) : 
  invalid class "waldCI" object: lb and ub must be finite.

Testing bad/improper setters:

bad <- ci1
try(lb(bad) <- 50)
Error in validObject(x) : 
  invalid class "waldCI" object: lb must be strictly less than ub.
bad2 <- ci1
try(level(bad2) <- 1.5)
Error in validObject(x) : 
  invalid class "waldCI" object: level must be strictly between 0 and 1.
bad3 <- ci1
try(sterr(bad3) <- -0.1)
Error in `sterr<-`(`*tmp*`, value = -0.1) : sterr must be positive.

Problem 2

usa <- read.csv("data/us-states.csv") %>% 
  mutate(date = as.Date(date))

a.

max_cases <- max(usa$cases_avg_per_100k, na.rm = TRUE)
major_cutoff <- 0.75 * max_cases
minor_cutoff <- 0.40 * max_cases

usa_spikes <- usa %>%
  mutate(
    spike_type = case_when(
      cases_avg_per_100k >= major_cutoff ~ "Major spike",
      cases_avg_per_100k >= minor_cutoff ~ "Minor spike",
      TRUE ~ NA_character_
    )
  )
p_a <- usa_spikes %>%
  plot_ly(
    x = ~date,
    y = ~cases_avg_per_100k,
    type = "scatter",
    mode = "lines",
    line = list(color = "rgba(100,100,100,0.8)"),
    name = "7-day average"
  ) %>%
  # add points only where spike_type is not NA
  add_markers(
    data = subset(usa_spikes, !is.na(spike_type)),
    x = ~date,
    y = ~cases_avg_per_100k,
    color = ~spike_type,
    colors = c("firebrick", "goldenrod"),
    marker = list(size = 6, opacity = 0.8),
    name = ~spike_type
  ) %>%
  layout(
    title = list(
      text = paste0(
        "U.S. COVID-19: Major and Minor Spikes (7-day average per 100k)",
        "<br><sub>Major ≥ ", round(major_cutoff, 1),
        " per 100k | Minor ≥ ", round(minor_cutoff, 1), "</sub>"
      )
    ),
    xaxis = list(title = ""),
    yaxis = list(title = "New cases per 100k (7-day avg)"),
    legend = list(orientation = "h", x = 0, y = 1.1),
    margin = list(t = 70),
    annotations = list(
      x = 1, y = -0.15, xref = "paper", yref = "paper",
      text = "Source: NYTimes COVID-19 Rolling Averages",
      showarrow = FALSE, xanchor = "right", yanchor = "auto"
    )
  )

p_a
A line object has been specified, but lines is not in the mode
Adding lines to the mode...
A line object has been specified, but lines is not in the mode
Adding lines to the mode...
spike_counts <- usa_spikes %>%
  mutate(
    is_spike = !is.na(spike_type),
    group_id = cumsum(is.na(lag(spike_type)) & !is.na(spike_type))
  ) %>%
  filter(is_spike) %>%
  group_by(spike_type, group_id) %>%
  summarise(
    start_date = min(date),
    end_date = max(date),
    .groups = "drop"
  ) %>%
  group_by(spike_type) %>%
  summarise(n_spikes = n(), .groups = "drop")

spike_counts
# A tibble: 2 × 2
  spike_type  n_spikes
  <chr>          <int>
1 Major spike       28
2 Minor spike      433

According to the results from our filtering criteria, there were 28 major spikes and 433 minor spikes.

b.

state_rates <- usa %>%
  group_by(state) %>%
  summarise(mean_rate = mean(cases_avg_per_100k, na.rm = TRUE), .groups = "drop")

top_states <- state_rates %>% slice_max(mean_rate, n = 5) %>% pull(state)
bottom_states <- state_rates %>% slice_min(mean_rate, n = 5) %>% pull(state)

plot_data <- usa %>%
  filter(state %in% c(top_states, bottom_states)) %>%
  mutate(
    category = case_when(
      state %in% top_states ~ "Highest overall rate",
      state %in% bottom_states ~ "Lowest overall rate"
    )
  )
p_top <- plot_data %>%
  filter(category == "Highest overall rate") %>%
  plot_ly(
    x = ~date,
    y = ~cases_avg_per_100k,
    color = ~state,
    type = "scatter",
    mode = "lines",
    hovertemplate = paste(
      "<b>%{color}</b><br>",
      "%{x}<br>",
      "Cases per 100k: %{y:.1f}<extra></extra>"
    )
  ) %>%
  layout(
    title = list(text = "Highest overall rate"),
    xaxis = list(title = ""),
    yaxis = list(title = "Cases per 100k (7-day avg)")
  )

p_bottom <- plot_data %>%
  filter(category == "Lowest overall rate") %>%
  plot_ly(
    x = ~date,
    y = ~cases_avg_per_100k,
    color = ~state,
    type = "scatter",
    mode = "lines",
    showlegend = FALSE,  # share legend from top
    hovertemplate = paste(
      "<b>%{color}</b><br>",
      "%{x}<br>",
      "Cases per 100k: %{y:.1f}<extra></extra>"
    )
  ) %>%
  layout(
    title = list(text = "Lowest overall rate"),
    xaxis = list(title = ""),
    yaxis = list(title = "Cases per 100k (7-day avg)")
  )

p_b <- subplot(
  p_top, p_bottom,
  nrows = 2,
  shareX = TRUE,
  titleY = TRUE,
  margin = 0.07
) %>%
  layout(
    title = list(
      text = paste0(
        "COVID-19 Trajectories: States with Highest and Lowest Average Case Rates",
        "<br><sub>7-day rolling average new cases per 100k (Top 5 vs. Bottom 5 states)</sub>"
      )
    ),
    legend = list(orientation = "h", x = 0, y = 1.15),
    annotations = list(
      x = 1, y = -0.12, xref = "paper", yref = "paper",
      text = "Source: NYTimes COVID-19 Rolling Averages",
      showarrow = FALSE, xanchor = "right"
    ),
    margin = list(t = 80)
  )

p_b

c.

first_substantial <- usa %>%
  group_by(state) %>%
  arrange(date, .by_group = TRUE) %>%
  mutate(
    above = cases_avg_per_100k >= 1.0,
    run = ave(above, cumsum(!above), FUN = seq_along)
  ) %>%
  filter(above & run >= 3) %>%
  summarise(first_date = min(date), .groups = "drop") %>%
  arrange(first_date)

first_five <- head(first_substantial, 5)

cov_plot <- usa %>%
  inner_join(first_five, by = "state") %>%
  filter(date >= first_date & date <= first_date + 60)

onset_lines <- first_five %>%
  mutate(first_date = as.Date(first_date))

states_order <- onset_lines$state
plots_list <- lapply(states_order, function(st) {
  dat <- cov_plot %>% filter(state == st)
  fd  <- onset_lines$first_date[onset_lines$state == st]
  max_y <- max(dat$cases_avg_per_100k, na.rm = TRUE)
  
  plot_ly(
    data = dat,
    x = ~date,
    y = ~cases_avg_per_100k,
    type = "scatter",
    mode = "lines",
    name = st,
    showlegend = FALSE
  ) %>%
    add_segments(
      x = fd, xend = fd,
      y = 0, yend = max_y,
      line = list(color = "rgba(80,80,80,0.9)", dash = "dash"),
      inherit = FALSE,
      showlegend = FALSE
    ) %>%
    layout(
      title = list(text = st),
      xaxis = list(title = ""),
      yaxis = list(title = "Cases per 100k (7-day avg)")
    )
})
p_c <- subplot(
  plots_list,
  nrows = 2,  # 2 rows x 3 cols (last cell empty) or 1x5, tweak as needed
  margin = 0.05,
  shareX = FALSE,
  shareY = FALSE,
  titleY = TRUE
) %>%
  layout(
    title = list(
      text = paste0(
        "Earliest Substantial COVID Activity by State",
        "<br><sub>First 5 states to exceed 1.0 cases per 100k (7-day avg) for 3+ consecutive days</sub>"
      )
    ),
    annotations = list(
      x = 1, y = -0.1, xref = "paper", yref = "paper",
      text = "Source: NYTimes COVID-19 Rolling Averages",
      showarrow = FALSE, xanchor = "right"
    ),
    margin = list(t = 80)
  )

p_c

Resources Used:

Plotly documentation, setGeneric and setMethod documentation, ChatGPT for debugging purposes